This project had me working with Kesava Asam under Brad Aouizerat to perform a scRNA-seq analysis of pancreatic islet data from patients under differing diabetic/non-diabetic status. A recent publication by Wang et al., of which this data was sourced, investigated beta cell dysfunction in type 2 diabetes. Thus, I was directed and guided to obtain the data from this experiment and perform an integrated analysis on the beta cells, utilizing Harmony for integration and SingleR for cell type identity annotation. From this data, we were able to perform a differential gene expression analysis.
This document will consist of the execution of Seurat’s scRNA-seq analysis on pancreatic islet data collected from patients of varying diabetic condition, which may be found under the accession number GSE200044. However, this version will utilize Harmony instead of Seurat’s integration.
The data found for this experiment will be subset to focus on the Non-diabetic (ND) and Type-2 Diabetic (T2D) patients, with the purpose of performing/confirming quality control, performing an integrated analysis, annotating clusters with SingleR, and comparing the data to find cell-type specific responses between ND and T2D.
Note that due to due to Brightspace’s upload size limit, the original “multiome_RNA_raw.h5ad” cannot be included in this submission. To re-run this RMarkdown document, create a folder named “GSE200044_Files” containing the GSE200044_cell_cluster file from the GEO page for GSE200044 and place it in the same project directory as this file. The path should read as shown in the section “Convert from h5ad to seurat object”.
library(SeuratDisk) # To convert h5ad to h5seurat
library(tidyverse) # Manipulation of the dataframes
library(janitor) # clean_names()
library(Seurat) # scRNA sequences
library(SeuratData)
library(ggplot2) #figures
library(GEOquery) # to get the phenotype
library(here) # to avoid setting working directory
library(SingleR) #To perform automated cell type annotation
library(celldex) #For reference expression datasets for sc-annotation
library(scRNAseq) #for use with providing reference dataset for annotation
library(ExperimentHub) #for assisting with annotation
library(scater) #for sc-analysis tools
library(harmony) #for integration
library(patchwork)
library(gridExtra) #for plotting more conveniently
library(scran) #For enabling further use of SingleR
library(pheatmap) #for heatmap generation
# Get the phenotype data from the Geo
gseGEO_test <- getGEO("GSE200044", GSEMatrix = F)
gseGEO <- getGEO("GSE200044")
gse <- gseGEO[[1]]
# Get the phenotype file
pheno_dft <-
pData(gse)
# select required columns
pheno <-
pheno_dft %>%
dplyr::select(c(geo_accession, contains(":ch1"), library_strategy)) %>%
janitor::clean_names()
# rename the columns to remove un-necessary characters
names(pheno) <-
sub("_ch1", "", names(pheno))
# subset only RNA data
pheno_rna <-
pheno %>%
filter(library_strategy == "RNA-Seq")
This code was used to obtain the appropriate GEO phenotype data, which will be merged and used with the Seurat object created in the following step.
# convert the h5ad file into h5seurat file format
#This only needs to be performed once, to create the original Seurat object
SeuratDisk::Convert(here("GSE200044_Files/cell_cluster/multiome_RNA_raw.h5ad"),
dest = "h5seurat",
assay = "RNA")
# Make seurat object of all samples together that are in the file
merged_data <- LoadH5Seurat(here(
"GSE200044_Files/cell_cluster/multiome_RNA_raw.h5Seurat"))
#Rename columns for clarity/consistency and merge the phenotypic data
# create a sample column
merged_data$row_names <- rownames(merged_data@meta.data)
# add the subsetted metadata downloaded from geo
merged_data@meta.data <-
merge(merged_data@meta.data, pheno_rna,
by.x = "donor", by.y = "sample_id")
#Rename counts and genes columns
names(merged_data@meta.data)[names(merged_data@meta.data)=='n_counts'] <-
'nCount_RNA'
names(merged_data@meta.data)[names(merged_data@meta.data)=='n_genes'] <-
'nFeature_RNA'
# check the merged metadata
merged_data@meta.data %>%
head()
## donor percent_mito nCount_RNA log10_n_counts log_n_counts nFeature_RNA
## 1 A0011 0.0008646779 4626 3.665206 8.439447 2237
## 2 A0011 0.0002760905 7244 3.859978 8.887929 3073
## 3 A0011 0.0000000000 5677 3.754119 8.644178 2459
## 4 A0011 0.0000000000 5130 3.710117 8.542861 2036
## 5 A0011 0.0001866368 5358 3.729003 8.586346 2144
## 6 A0011 0.0000000000 3515 3.545925 8.164795 1727
## log10_n_genes batch row_names geo_accession age bmi
## 1 3.349666 0 A0011_AAACAGCCACAGCCTG-1 GSM6005279 34 31.5
## 2 3.487563 0 A0011_AAACAGCCACTAGGTC-1 GSM6005279 34 31.5
## 3 3.390759 0 A0011_AAACCAACAAATTGCT-1 GSM6005279 34 31.5
## 4 3.308778 0 A0011_AAACCGAAGCAGGTGG-1 GSM6005279 34 31.5
## 5 3.331225 0 A0011_AAACCGAAGTAATCCA-1 GSM6005279 34 31.5
## 6 3.237292 0 A0011_AAACCGAAGTGAAGTG-1 GSM6005279 34 31.5
## disease_state gender hba1c islet_index library_id purity race
## 1 Pre-T2D F 5.7 1.3 MM755 0.8 Caucasian
## 2 Pre-T2D F 5.7 1.3 MM755 0.8 Caucasian
## 3 Pre-T2D F 5.7 1.3 MM755 0.8 Caucasian
## 4 Pre-T2D F 5.7 1.3 MM755 0.8 Caucasian
## 5 Pre-T2D F 5.7 1.3 MM755 0.8 Caucasian
## 6 Pre-T2D F 5.7 1.3 MM755 0.8 Caucasian
## library_strategy
## 1 RNA-Seq
## 2 RNA-Seq
## 3 RNA-Seq
## 4 RNA-Seq
## 5 RNA-Seq
## 6 RNA-Seq
# re-add the rownames
merged_data@meta.data <-
merged_data@meta.data %>%
column_to_rownames("row_names")
# check the metadata
merged_data@meta.data %>% head()
## donor percent_mito nCount_RNA log10_n_counts
## A0011_AAACAGCCACAGCCTG-1 A0011 0.0008646779 4626 3.665206
## A0011_AAACAGCCACTAGGTC-1 A0011 0.0002760905 7244 3.859978
## A0011_AAACCAACAAATTGCT-1 A0011 0.0000000000 5677 3.754119
## A0011_AAACCGAAGCAGGTGG-1 A0011 0.0000000000 5130 3.710117
## A0011_AAACCGAAGTAATCCA-1 A0011 0.0001866368 5358 3.729003
## A0011_AAACCGAAGTGAAGTG-1 A0011 0.0000000000 3515 3.545925
## log_n_counts nFeature_RNA log10_n_genes batch
## A0011_AAACAGCCACAGCCTG-1 8.439447 2237 3.349666 0
## A0011_AAACAGCCACTAGGTC-1 8.887929 3073 3.487563 0
## A0011_AAACCAACAAATTGCT-1 8.644178 2459 3.390759 0
## A0011_AAACCGAAGCAGGTGG-1 8.542861 2036 3.308778 0
## A0011_AAACCGAAGTAATCCA-1 8.586346 2144 3.331225 0
## A0011_AAACCGAAGTGAAGTG-1 8.164795 1727 3.237292 0
## geo_accession age bmi disease_state gender hba1c
## A0011_AAACAGCCACAGCCTG-1 GSM6005279 34 31.5 Pre-T2D F 5.7
## A0011_AAACAGCCACTAGGTC-1 GSM6005279 34 31.5 Pre-T2D F 5.7
## A0011_AAACCAACAAATTGCT-1 GSM6005279 34 31.5 Pre-T2D F 5.7
## A0011_AAACCGAAGCAGGTGG-1 GSM6005279 34 31.5 Pre-T2D F 5.7
## A0011_AAACCGAAGTAATCCA-1 GSM6005279 34 31.5 Pre-T2D F 5.7
## A0011_AAACCGAAGTGAAGTG-1 GSM6005279 34 31.5 Pre-T2D F 5.7
## islet_index library_id purity race
## A0011_AAACAGCCACAGCCTG-1 1.3 MM755 0.8 Caucasian
## A0011_AAACAGCCACTAGGTC-1 1.3 MM755 0.8 Caucasian
## A0011_AAACCAACAAATTGCT-1 1.3 MM755 0.8 Caucasian
## A0011_AAACCGAAGCAGGTGG-1 1.3 MM755 0.8 Caucasian
## A0011_AAACCGAAGTAATCCA-1 1.3 MM755 0.8 Caucasian
## A0011_AAACCGAAGTGAAGTG-1 1.3 MM755 0.8 Caucasian
## library_strategy
## A0011_AAACAGCCACAGCCTG-1 RNA-Seq
## A0011_AAACAGCCACTAGGTC-1 RNA-Seq
## A0011_AAACCAACAAATTGCT-1 RNA-Seq
## A0011_AAACCGAAGCAGGTGG-1 RNA-Seq
## A0011_AAACCGAAGTAATCCA-1 RNA-Seq
## A0011_AAACCGAAGTGAAGTG-1 RNA-Seq
merged_data@meta.data %>%
tail()
## donor percent_mito nCount_RNA log10_n_counts
## C0027_TTTGTTGGTGAGGTAG-1 C0027 0.0642023310 2056 3.313023
## C0027_TTTGTTGGTGCTCCGT-1 C0027 0.0015812777 3162 3.499962
## C0027_TTTGTTGGTGTGTCCC-1 C0027 0.0007208073 4162 3.619302
## C0027_TTTGTTGGTTAGTGAT-1 C0027 0.0036553524 1915 3.282169
## C0027_TTTGTTGGTTTCGCGC-1 C0027 0.0008949881 3352 3.525304
## C0027_TTTGTTGGTTTGACCT-1 C0027 0.0002717391 7360 3.866878
## log_n_counts nFeature_RNA log10_n_genes batch
## C0027_TTTGTTGGTGAGGTAG-1 7.628518 1309 3.116940 19
## C0027_TTTGTTGGTGCTCCGT-1 8.058960 1701 3.230704 19
## C0027_TTTGTTGGTGTGTCCC-1 8.333751 2118 3.325926 19
## C0027_TTTGTTGGTTAGTGAT-1 7.557473 1212 3.083503 19
## C0027_TTTGTTGGTTTCGCGC-1 8.117312 1839 3.264582 19
## C0027_TTTGTTGGTTTGACCT-1 8.903815 3011 3.478711 19
## geo_accession age bmi disease_state gender hba1c
## C0027_TTTGTTGGTGAGGTAG-1 GSM6005283 24 23.9 Non-diabetic M 5
## C0027_TTTGTTGGTGCTCCGT-1 GSM6005283 24 23.9 Non-diabetic M 5
## C0027_TTTGTTGGTGTGTCCC-1 GSM6005283 24 23.9 Non-diabetic M 5
## C0027_TTTGTTGGTTAGTGAT-1 GSM6005283 24 23.9 Non-diabetic M 5
## C0027_TTTGTTGGTTTCGCGC-1 GSM6005283 24 23.9 Non-diabetic M 5
## C0027_TTTGTTGGTTTGACCT-1 GSM6005283 24 23.9 Non-diabetic M 5
## islet_index library_id purity race library_strategy
## C0027_TTTGTTGGTGAGGTAG-1 1.7 MM729 0.95 White RNA-Seq
## C0027_TTTGTTGGTGCTCCGT-1 1.7 MM729 0.95 White RNA-Seq
## C0027_TTTGTTGGTGTGTCCC-1 1.7 MM729 0.95 White RNA-Seq
## C0027_TTTGTTGGTTAGTGAT-1 1.7 MM729 0.95 White RNA-Seq
## C0027_TTTGTTGGTTTCGCGC-1 1.7 MM729 0.95 White RNA-Seq
## C0027_TTTGTTGGTTTGACCT-1 1.7 MM729 0.95 White RNA-Seq
The samples under each condition are shown below.
We will proceed focusing on just the ND and T2D groups.
Non-diabetic: C0026, C0027, A0019, A0033, A0027, C0025
Pre-T2D: A0011, A0028, A0029, C0013, C0014, A0030, A0021, C0022
T2D: C0019, C0024, C0021, A0024, A0031, C0023
#Check disease state labels
unique(merged_data$disease_state)
## [1] "Pre-T2D" "Non-diabetic" "T2D"
#Check number of each disease state
length(which(merged_data$disease_state=='Pre-T2D'))
## [1] 30702
length(which(merged_data$disease_state=='Non-diabetic'))
## [1] 21012
length(which(merged_data$disease_state=='T2D'))
## [1] 33552
#Check amount of M vs F
length(which(merged_data$gender=='M'))
## [1] 61756
length(which(merged_data$gender=='F'))
## [1] 23510
#Subset, not including any rows including the Pre-T2D disease state
merged_data_ND_T2D <- subset(merged_data, disease_state != 'Pre-T2D')
#Display the first few entries
head(merged_data_ND_T2D$disease_state)
## A0019_AAACAGCCAGGACCAA-1 A0019_AAACAGCCATGAGTTT-1 A0019_AAACATGCAGCTTAAT-1
## "Non-diabetic" "Non-diabetic" "Non-diabetic"
## A0019_AAACCAACACTATGGC-1 A0019_AAACCGAAGGGACTAA-1 A0019_AAACGCGCAGCTAACC-1
## "Non-diabetic" "Non-diabetic" "Non-diabetic"
#Confirm that Pre-T2D entries have been removed
unique(merged_data_ND_T2D$disease_state)
## [1] "Non-diabetic" "T2D"
#Sanity check for object new object's class
class(merged_data_ND_T2D)
## [1] "Seurat"
## attr(,"package")
## [1] "SeuratObject"
#Use the PercentagegFeatureSet() function to identify the percentage of
# counts originating from mitochondrial genes identified with '^MT-'
merged_data_ND_T2D$my_percent_mito <-
PercentageFeatureSet(merged_data_ND_T2D, pattern = '^MT-')
#use view(merged_GSE_seurat) to see results more clearly
head(merged_data_ND_T2D$my_percent_mito)
## A0019_AAACAGCCAGGACCAA-1 A0019_AAACAGCCATGAGTTT-1 A0019_AAACATGCAGCTTAAT-1
## 0.12131015 0.04024145 0.01697505
## A0019_AAACCAACACTATGGC-1 A0019_AAACCGAAGGGACTAA-1 A0019_AAACGCGCAGCTAACC-1
## 0.02262443 0.04752852 0.03916193
#Visualize QC metrics before filtering
VlnPlot(merged_data_ND_T2D, features = c("nFeature_RNA",
"nCount_RNA",
"my_percent_mito"),
ncol = 3)
#Visualize relationships with FeatureScatter
plot1 <- FeatureScatter(merged_data_ND_T2D, feature1 = "nCount_RNA",
feature2 = "my_percent_mito")
plot2 <- FeatureScatter(merged_data_ND_T2D, feature1 = "nCount_RNA",
feature2 = "nFeature_RNA")
plot1 + plot2
Fortunately, in the FeatureScatter plot for nFeature_RNA, we do not see any points in the top left or bottom right, which would indicate that the experiment captured a high number of genes that are not deeply sequenced, or the experiment has only captured a few genes which have been sequenced over and over again, respectively.
Percent mitochondrial transcripts already seems to be tidy and have a limit of 10%, and the nFeature_RNA plot similarly has no obvious outliers in its distribution. It seems likely that the data obtained from the GEO page has already been somewhat processed.
Percent mitochondrial seems to have already been pre-processed in the source file to have a cutoff of 10%, which I agree with referencing the following article: This article suggests a 10% cutoff as appropriate for human tissues, rather than 5%
This is also consistent with the cutoff chosen in the
However, we will filter at 10% regardless to confirm.
There are also no obvious visual outliers in the nFeature_RNA graph, so we will just use a cutoff for the lower end, requiring a minimum of nFeature_RNA > 200 as is the standard.
#The data already appears to have been filtered satisfactorily to these settings,
# but we shall run it anyways to be certain
merged_data_ND_T2D_filtered <- subset(merged_data_ND_T2D,
subset = nFeature_RNA > 200 &
my_percent_mito < 10)
#normalize data
merged_data_ND_T2D_filtered <-
NormalizeData(object = merged_data_ND_T2D_filtered)
#find variable features
merged_data_ND_T2D_filtered <-
FindVariableFeatures(object = merged_data_ND_T2D_filtered)
#Scale data
merged_data_ND_T2D_filtered <-
ScaleData(object = merged_data_ND_T2D_filtered)
#Perform dimensionality reduction
merged_data_ND_T2D_filtered <-
RunPCA(object = merged_data_ND_T2D_filtered)
We can visualize the most highly variable genes found by FindVariableFeatures.
#Identify and plot the top 10 highly variable genes
top10 <- head(VariableFeatures(merged_data_ND_T2D_filtered), 10)
top10_plot <- VariableFeaturePlot(merged_data_ND_T2D_filtered)
LabelPoints(plot = top10_plot, points = top10, repel = TRUE)
We can use Elbow Plot to approximate the dimensionality. It is a commonly used heuristic and is much faster than methods such as JackStraw.
#Check dimensionality to 50 dims
ElbowPlot(merged_data_ND_T2D_filtered, ndims = 50)
We will use a Standard Deviation of 1 to select our cutoff, which appears to be approximately 40. We will use 40 PCs moving forward.
This cutoff was chosen due to there still being a decline after the “more obvious elbow” around 12 PCs until around 40, as well as to be a bit more liberal because Elbow Plots can be visually subjective and underestimate the dimensionality.
We can also create a pre-integration plot to compare to post-integration.
#Run UMAP and display the pre-integration plot
merged_data_ND_T2D_filtered <-
RunUMAP(merged_data_ND_T2D_filtered, dims = 1:40, reduction = 'pca')
before <-
DimPlot(merged_data_ND_T2D_filtered,
reduction = 'umap', group.by = "disease_state")
We will use Harmony to perform integration and perform an integrated analysis.
ND_T2D.harmony <- merged_data_ND_T2D_filtered %>%
RunHarmony(group.by.vars = "disease_state", plot_convergence = FALSE)
#We can check the reductions in the harmony object
ND_T2D.harmony@reductions
## $pca
## A dimensional reduction object with key PC_
## Number of dimensions: 50
## Projected dimensional reduction calculated: FALSE
## Jackstraw run: FALSE
## Computed using assay: RNA
##
## $umap
## A dimensional reduction object with key UMAP_
## Number of dimensions: 2
## Projected dimensional reduction calculated: FALSE
## Jackstraw run: FALSE
## Computed using assay: RNA
##
## $harmony
## A dimensional reduction object with key harmony_
## Number of dimensions: 50
## Projected dimensional reduction calculated: TRUE
## Jackstraw run: FALSE
## Computed using assay: RNA
#We will specify our harmony reductions in the future when doing clustering
#Let's also save our harmony embeddings
ND_T2D.harmony.embed <- Embeddings(ND_T2D.harmony, "harmony")
ND_T2D.harmony <- ND_T2D.harmony %>%
RunUMAP(reduction = 'harmony', dims = 1:40) %>%
FindNeighbors(reduction = 'harmony', dims = 1:40) %>%
FindClusters(resolution = 0.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 54564
## Number of edges: 1883743
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.9224
## Number of communities: 21
## Elapsed time: 11 seconds
ScaleData() and RunPCA() are not used here, since Harmony will take over those steps, and we use Harmony embeddings instead of PCA for the UMAP and clustering.
The original article used a resolution of 1.5, using all of the original data.
We can take a look at Harmony’s clustering at that resolution as well as at 1.0, which may be interesting because our dataset is a subset of what was performed in the paper.
ND_T2D.harmony <- ND_T2D.harmony %>%
FindClusters(resolution = 1.0)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 54564
## Number of edges: 1883743
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8932
## Number of communities: 30
## Elapsed time: 12 seconds
ND_T2D.harmony <- ND_T2D.harmony %>%
FindClusters(resolution = 1.5)
## Modularity Optimizer version 1.3.0 by Ludo Waltman and Nees Jan van Eck
##
## Number of nodes: 54564
## Number of edges: 1883743
##
## Running Louvain algorithm...
## Maximum modularity in 10 random starts: 0.8701
## Number of communities: 34
## Elapsed time: 11 seconds
#We use the umap reduction here, since it was calculated from Harmony in the
## previous step
after <- DimPlot(ND_T2D.harmony, reduction = 'umap', group.by = 'disease_state')
#plot
before|after
# Visualization of clusters
p1 <- DimPlot(ND_T2D.harmony, reduction = "umap", group.by = "disease_state")
p2 <- DimPlot(ND_T2D.harmony, reduction = "umap", label = TRUE, repel = TRUE)
p1 + p2
#Assign identity of clusters
Idents(object = ND_T2D.harmony) <- "RNA_snn_res.0.5"
DimPlot(ND_T2D.harmony, reduction = "umap", split.by = "disease_state",
label = TRUE)
#Assign identity of clusters
Idents(object = ND_T2D.harmony) <- "RNA_snn_res.1"
#Plot UMAP
DimPlot(ND_T2D.harmony,
reduction = "umap",
label = TRUE)
#Assign identity of clusters
Idents(object = ND_T2D.harmony) <- "RNA_snn_res.1.5"
#Plot UMAP
DimPlot(ND_T2D.harmony,
reduction = "umap",
label = TRUE)
We can also check the Harmony assays again to see the top 10 variable features
ND_T2D.harmony@assays
## $RNA
## Assay data with 58686 features for 54564 cells
## Top 10 variable features:
## SST, CFTR, REG1A, REG1B, KCND2, AGPAT5, FGF13, BNC2, CTRB2, PPY
Let’s proceed with the 1.5 resolution the article chose, since we have a high number of cells (54.5k).
The Ident will remain as the last one that was set, which is the 1.5 resolution in our case.
We will use SingleR to propagate the marker gene definition and cluster interpretation from the Baron Pancreas Dataset to our own data.
This will allow the annotation process to be swift and automated, which improves upon the speed and accuracy when compared to manual annotation.
#Load Baron Human pancreas dataset
Baron_Pancreas <- BaronPancreasData('human')
Baron_Pancreas
## class: SingleCellExperiment
## dim: 20125 8569
## metadata(0):
## assays(1): counts
## rownames(20125): A1BG A1CF ... ZZZ3 pk
## rowData names(0):
## colnames(8569): human1_lib1.final_cell_0001 human1_lib1.final_cell_0002
## ... human4_lib3.final_cell_0700 human4_lib3.final_cell_0701
## colData names(2): donor label
## reducedDimNames(0):
## mainExpName: NULL
## altExpNames(0):
#Remove the unlabeled libraries here
Baron_Pancreas <- Baron_Pancreas[,!is.na(Baron_Pancreas$label)]
#SingleR expects normalized and log-transformed reference datasets
Baron_Pancreas <- logNormCounts(Baron_Pancreas)
This is performed after our clustering from integration. However, we will make sure to be using the RNA assay instead of the Integrated assay, since we will soon be looking at the differentially expressed genes.
This will be performed after the integrated analysis, where we would typically perform the “conserved cell type markers” step.
Since SingleR will annotate cell labels automatically for our dataset, let’s use it now with the Baron pancreatic islet dataset as reference.
#Now perform singleR using the same test dataset, but using Baron as ref
#Ensure we are using RNA assay for differential expression after integration
DefaultAssay(ND_T2D.harmony) <- "RNA"
singleR_results_ND_T2D <- SingleR(test = as.SingleCellExperiment(ND_T2D.harmony),
ref = Baron_Pancreas, labels = Baron_Pancreas$label,
de.method = "wilcox")
table(singleR_results_ND_T2D$labels)
##
## acinar activated_stellate alpha beta
## 2235 189 21136 20806
## delta ductal endothelial epsilon
## 3498 1338 120 244
## gamma macrophage mast quiescent_stellate
## 4336 163 15 421
## schwann t_cell
## 56 7
#Append metadata with these new labels
ND_T2D.harmony$Baron_singleR_labels <- singleR_results_ND_T2D$labels
#and check
head(ND_T2D.harmony[[]])
## donor percent_mito nCount_RNA log10_n_counts
## A0019_AAACAGCCAGGACCAA-1 A0019 0.0012131016 4946 3.694254
## A0019_AAACAGCCATGAGTTT-1 A0019 0.0004024145 4970 3.696356
## A0019_AAACATGCAGCTTAAT-1 A0019 0.0001697505 5891 3.770189
## A0019_AAACCAACACTATGGC-1 A0019 0.0002262443 4420 3.645422
## A0019_AAACCGAAGGGACTAA-1 A0019 0.0004752852 4208 3.624076
## A0019_AAACGCGCAGCTAACC-1 A0019 0.0003916193 5107 3.708166
## log_n_counts nFeature_RNA log10_n_genes batch
## A0019_AAACAGCCAGGACCAA-1 8.506334 2140 3.330414 1
## A0019_AAACAGCCATGAGTTT-1 8.511175 2173 3.337060 1
## A0019_AAACATGCAGCTTAAT-1 8.681181 2438 3.387034 1
## A0019_AAACCAACACTATGGC-1 8.393895 1931 3.285782 1
## A0019_AAACCGAAGGGACTAA-1 8.344743 1611 3.207096 1
## A0019_AAACGCGCAGCTAACC-1 8.538367 2310 3.363612 1
## geo_accession age bmi disease_state gender hba1c
## A0019_AAACAGCCAGGACCAA-1 GSM6005286 61 27 Non-diabetic M 5.6
## A0019_AAACAGCCATGAGTTT-1 GSM6005286 61 27 Non-diabetic M 5.6
## A0019_AAACATGCAGCTTAAT-1 GSM6005286 61 27 Non-diabetic M 5.6
## A0019_AAACCAACACTATGGC-1 GSM6005286 61 27 Non-diabetic M 5.6
## A0019_AAACCGAAGGGACTAA-1 GSM6005286 61 27 Non-diabetic M 5.6
## A0019_AAACGCGCAGCTAACC-1 GSM6005286 61 27 Non-diabetic M 5.6
## islet_index library_id purity race
## A0019_AAACAGCCAGGACCAA-1 1.02 MM754 0.8 Caucasian
## A0019_AAACAGCCATGAGTTT-1 1.02 MM754 0.8 Caucasian
## A0019_AAACATGCAGCTTAAT-1 1.02 MM754 0.8 Caucasian
## A0019_AAACCAACACTATGGC-1 1.02 MM754 0.8 Caucasian
## A0019_AAACCGAAGGGACTAA-1 1.02 MM754 0.8 Caucasian
## A0019_AAACGCGCAGCTAACC-1 1.02 MM754 0.8 Caucasian
## library_strategy my_percent_mito RNA_snn_res.0.5
## A0019_AAACAGCCAGGACCAA-1 RNA-Seq 0.12131015 0
## A0019_AAACAGCCATGAGTTT-1 RNA-Seq 0.04024145 0
## A0019_AAACATGCAGCTTAAT-1 RNA-Seq 0.01697505 3
## A0019_AAACCAACACTATGGC-1 RNA-Seq 0.02262443 0
## A0019_AAACCGAAGGGACTAA-1 RNA-Seq 0.04752852 7
## A0019_AAACGCGCAGCTAACC-1 RNA-Seq 0.03916193 1
## seurat_clusters RNA_snn_res.1 RNA_snn_res.1.5
## A0019_AAACAGCCAGGACCAA-1 1 2 1
## A0019_AAACAGCCATGAGTTT-1 7 6 7
## A0019_AAACATGCAGCTTAAT-1 3 4 3
## A0019_AAACCAACACTATGGC-1 7 6 7
## A0019_AAACCGAAGGGACTAA-1 20 9 20
## A0019_AAACGCGCAGCTAACC-1 4 3 4
## Baron_singleR_labels
## A0019_AAACAGCCAGGACCAA-1 beta
## A0019_AAACAGCCATGAGTTT-1 delta
## A0019_AAACATGCAGCTTAAT-1 alpha
## A0019_AAACCAACACTATGGC-1 beta
## A0019_AAACCGAAGGGACTAA-1 ductal
## A0019_AAACGCGCAGCTAACC-1 alpha
#Plot labels and clusters with DimPlot
DimPlot(ND_T2D.harmony, reduction = 'umap',
group.by = 'Baron_singleR_labels', label = T, repel = T)
We can also display a heatmap of the score matrix from SingleR, which will indicate our level of certainty of our assignments.
plotScoreHeatmap(singleR_results_ND_T2D)
While our heatmap could be more distinct, there is easily visible distinction in the difference in scores for the cell types that make up a bulk of the data, such as alpha, beta, gamma, and delta cells.
Since there is clear difference in scores for alpha and beta cells compared to the rest, we can proceed with further analysis on these cells.
#Also look at delta distribution
#Labels with very low deltas may need to be cautiously interpreted
plotDeltaDistribution(singleR_results_ND_T2D)
#Prepare a column with the labels and condition combined for use in FindMarkers
ND_T2D.harmony$celltype.cnd <- paste(ND_T2D.harmony$Baron_singleR_labels, "_",
ND_T2D.harmony$disease_state)
#Set Idents to this new column
Idents(ND_T2D.harmony) <- ND_T2D.harmony$celltype.cnd
Lets plot our newly labeled ND_T2D.harmony, showing our ND and T2D conditions for each of the cluster identities.
#Check plot of current state of ND_T2D.harmony
DimPlot(ND_T2D.harmony, reduction = "umap", label = TRUE)
We will now use FindMarkers to leverage the updated identities of the cells and compare the diabetic conditions
beta_t2d_response <- FindMarkers(ND_T2D.harmony, ident.1 = "beta _ T2D",
ident.2 = "beta _ Non-diabetic")
#Let's inspect the results
head(beta_t2d_response, n=10)
## p_val avg_log2FC pct.1 pct.2 p_val_adj
## PDE4B 0 1.3394440 0.432 0.142 0
## SGIP1 0 1.2773146 0.566 0.220 0
## MARK1 0 -0.8323666 0.510 0.712 0
## RYR2 0 1.3608610 0.657 0.317 0
## KIF26B 0 1.0611685 0.527 0.275 0
## EML6 0 0.8101765 0.675 0.465 0
## ZBTB20 0 0.4348783 0.995 0.988 0
## LSAMP 0 0.9120747 0.808 0.631 0
## CPNE4 0 1.3103470 0.555 0.323 0
## EGFEM1P 0 0.9493805 0.789 0.600 0
This object contains genes found to be differentially expressed when comparing between the ND and T2D conditions. The most biologically significant genes in this are the ones most up/down-regulated between the T2D and ND groups.
However, it is important to note the pct.1 and pct.2 signify the percentage of cells where the gene is detected, so the output shown above is ordered in such a way these are taken into consideration.
Let’s plot some of these DE features.
#Lets plot the differential expression of the markers found in the previous
# step between the two conditions
#This is amongst all clusters
#Plot the first four
FeaturePlot(ND_T2D.harmony, features = c("PDE4B", "SGIP1", "MARK1", "RYR2"),
split.by = "disease_state", min.cutoff = 'q10')
#And the next 3
FeaturePlot(ND_T2D.harmony, features = c("KIF26B", "EML6", "ZBTB20"),
split.by = "disease_state", min.cutoff = 'q10')
We can also look at some of the genes with the most dramatic up/down-regulation, regardless of pct.1 and pct.2.
If we check the beta_t2d_response object and look at the top 3 most positively and negative log2FC genes with extremely low p-values, we obtain the following log2FC:
(+) XIST: 1.718
PRUNE2: 1.393 RYR2: 1.361
(-) INS: -1.504 AGPAT5: -1.411 UTY: -1.160
Positive values for log2FC means the gene is more expressed in the T2D condition, and negative indicates the gene is lowly expressed in the T2D condition when compared to ND.
#Now lets first plot the upregulated genes
FeaturePlot(ND_T2D.harmony, features = c("XIST", "PRUNE2", "RYR2"),
split.by = "disease_state", min.cutoff = 'q10')
At first, we see an upregulation in XIST in the T2D condition, which at first seems odd since this gene is typically only expressed in females as it produces a long noncoding RNA that initiates chromosome-wide gene repression on the inactive X chromosome in mammalian females. However, this can be explained by the gender imbalance of the dataset that was inspected previously, with a roughly 5:2 ratio of M:F.
#Now lets plot the DOWNregulated genes
FeaturePlot(ND_T2D.harmony, features = c("INS", "AGPAT5", "UTY"),
split.by = "disease_state", min.cutoff = 'q10')
We can plot some additional genes of interest, which affect regulation of insulin.
#And lets plot TCF7L2 which affects insulin secretion, and ABCC8
# which helps regulate insulin
FeaturePlot(ND_T2D.harmony, features = c("TCF7L2", "ABCC8"),
split.by = "disease_state", min.cutoff = 'q10')
#And their avg log2FC
print("TCF7L2:")
## [1] "TCF7L2:"
beta_t2d_response["TCF7L2","avg_log2FC"]
## [1] 0.2579478
print("ABCC8:")
## [1] "ABCC8:"
beta_t2d_response["ABCC8","avg_log2FC"]
## [1] NA
We can also use Violin Plots to display these changes in gene expression.
violin_plot1 <- VlnPlot(ND_T2D.harmony, features = c("INS","MARK1","PRUNE2"),
split.by = "disease_state",
group.by = "Baron_singleR_labels",
pt.size = 0, combine = FALSE)
wrap_plots(plots = violin_plot1, ncol = 1)
sessionInfo()
## R version 4.3.1 (2023-06-16 ucrt)
## Platform: x86_64-w64-mingw32/x64 (64-bit)
## Running under: Windows 10 x64 (build 19045)
##
## Matrix products: default
##
##
## locale:
## [1] LC_COLLATE=English_United States.utf8
## [2] LC_CTYPE=English_United States.utf8
## [3] LC_MONETARY=English_United States.utf8
## [4] LC_NUMERIC=C
## [5] LC_TIME=English_United States.utf8
##
## time zone: America/Los_Angeles
## tzcode source: internal
##
## attached base packages:
## [1] stats4 stats graphics grDevices utils datasets methods
## [8] base
##
## other attached packages:
## [1] pheatmap_1.0.12 scran_1.28.2
## [3] gridExtra_2.3 patchwork_1.1.2
## [5] harmony_0.1.1 Rcpp_1.0.11
## [7] scater_1.28.0 scuttle_1.10.2
## [9] ExperimentHub_2.8.1 AnnotationHub_3.8.0
## [11] BiocFileCache_2.8.0 dbplyr_2.3.3
## [13] scRNAseq_2.14.0 SingleCellExperiment_1.22.0
## [15] celldex_1.10.1 SingleR_2.2.0
## [17] SummarizedExperiment_1.30.2 GenomicRanges_1.52.0
## [19] GenomeInfoDb_1.36.1 IRanges_2.34.1
## [21] S4Vectors_0.38.1 MatrixGenerics_1.12.3
## [23] matrixStats_1.0.0 here_1.0.1
## [25] GEOquery_2.68.0 Biobase_2.60.0
## [27] BiocGenerics_0.46.0 SeuratData_0.2.2
## [29] SeuratObject_4.1.3 Seurat_4.3.0.1
## [31] janitor_2.2.0 lubridate_1.9.2
## [33] forcats_1.0.0 stringr_1.5.0
## [35] dplyr_1.1.2 purrr_1.0.1
## [37] readr_2.1.4 tidyr_1.3.0
## [39] tibble_3.2.1 ggplot2_3.4.2
## [41] tidyverse_2.0.0 SeuratDisk_0.0.0.9020
##
## loaded via a namespace (and not attached):
## [1] ProtGenerics_1.32.0 spatstat.sparse_3.0-2
## [3] bitops_1.0-7 httr_1.4.6
## [5] RColorBrewer_1.1-3 tools_4.3.1
## [7] sctransform_0.3.5 utf8_1.2.3
## [9] R6_2.5.1 lazyeval_0.2.2
## [11] uwot_0.1.16 withr_2.5.0
## [13] sp_2.0-0 prettyunits_1.1.1
## [15] progressr_0.14.0 cli_3.6.1
## [17] spatstat.explore_3.2-1 labeling_0.4.2
## [19] sass_0.4.7 spatstat.data_3.0-1
## [21] ggridges_0.5.4 pbapply_1.7-2
## [23] Rsamtools_2.16.0 R.utils_2.12.2
## [25] parallelly_1.36.0 limma_3.56.2
## [27] rstudioapi_0.15.0 RSQLite_2.3.1
## [29] generics_0.1.3 BiocIO_1.10.0
## [31] ica_1.0-3 spatstat.random_3.1-5
## [33] Matrix_1.6-0 ggbeeswarm_0.7.2
## [35] fansi_1.0.4 abind_1.4-5
## [37] R.methodsS3_1.8.2 lifecycle_1.0.3
## [39] edgeR_3.42.4 yaml_2.3.7
## [41] snakecase_0.11.0 Rtsne_0.16
## [43] grid_4.3.1 blob_1.2.4
## [45] dqrng_0.3.0 promises_1.2.0.1
## [47] crayon_1.5.2 miniUI_0.1.1.1
## [49] lattice_0.21-8 beachmat_2.16.0
## [51] cowplot_1.1.1 GenomicFeatures_1.52.1
## [53] KEGGREST_1.40.0 metapod_1.8.0
## [55] pillar_1.9.0 knitr_1.43
## [57] rjson_0.2.21 future.apply_1.11.0
## [59] codetools_0.2-19 leiden_0.4.3
## [61] glue_1.6.2 data.table_1.14.8
## [63] vctrs_0.6.3 png_0.1-8
## [65] gtable_0.3.3 cachem_1.0.8
## [67] xfun_0.39 S4Arrays_1.0.5
## [69] mime_0.12 survival_3.5-5
## [71] statmod_1.5.0 bluster_1.10.0
## [73] interactiveDisplayBase_1.38.0 ellipsis_0.3.2
## [75] fitdistrplus_1.1-11 ROCR_1.0-11
## [77] nlme_3.1-163 bit64_4.0.5
## [79] progress_1.2.2 filelock_1.0.2
## [81] RcppAnnoy_0.0.21 rprojroot_2.0.3
## [83] bslib_0.5.1 irlba_2.3.5.1
## [85] vipor_0.4.5 KernSmooth_2.23-22
## [87] colorspace_2.1-0 DBI_1.1.3
## [89] ggrastr_1.0.2 tidyselect_1.2.0
## [91] bit_4.0.5 compiler_4.3.1
## [93] curl_5.0.1 BiocNeighbors_1.18.0
## [95] hdf5r_1.3.8 xml2_1.3.5
## [97] DelayedArray_0.26.7 plotly_4.10.2
## [99] rtracklayer_1.60.0 scales_1.2.1
## [101] lmtest_0.9-40 rappdirs_0.3.3
## [103] digest_0.6.33 goftest_1.2-3
## [105] spatstat.utils_3.0-3 rmarkdown_2.24
## [107] XVector_0.40.0 htmltools_0.5.5
## [109] pkgconfig_2.0.3 sparseMatrixStats_1.12.2
## [111] highr_0.10 ensembldb_2.24.0
## [113] fastmap_1.1.1 rlang_1.1.1
## [115] htmlwidgets_1.6.2 shiny_1.7.5
## [117] DelayedMatrixStats_1.22.5 farver_2.1.1
## [119] jquerylib_0.1.4 zoo_1.8-12
## [121] jsonlite_1.8.7 BiocParallel_1.34.2
## [123] R.oo_1.25.0 BiocSingular_1.16.0
## [125] RCurl_1.98-1.12 magrittr_2.0.3
## [127] GenomeInfoDbData_1.2.10 munsell_0.5.0
## [129] viridis_0.6.4 reticulate_1.30
## [131] stringi_1.7.12 zlibbioc_1.46.0
## [133] MASS_7.3-60 plyr_1.8.8
## [135] parallel_4.3.1 listenv_0.9.0
## [137] ggrepel_0.9.3 deldir_1.0-9
## [139] Biostrings_2.68.1 splines_4.3.1
## [141] tensor_1.5 hms_1.1.3
## [143] locfit_1.5-9.8 igraph_1.5.0.1
## [145] spatstat.geom_3.2-4 reshape2_1.4.4
## [147] biomaRt_2.56.1 ScaledMatrix_1.8.1
## [149] BiocVersion_3.17.1 XML_3.99-0.14
## [151] evaluate_0.21 BiocManager_1.30.22
## [153] tzdb_0.4.0 httpuv_1.6.11
## [155] RANN_2.6.1 polyclip_1.10-4
## [157] future_1.33.0 scattermore_1.2
## [159] rsvd_1.0.5 xtable_1.8-4
## [161] AnnotationFilter_1.24.0 restfulr_0.0.15
## [163] later_1.3.1 viridisLite_0.4.2
## [165] beeswarm_0.4.0 memoise_2.0.1
## [167] AnnotationDbi_1.62.2 GenomicAlignments_1.36.0
## [169] cluster_2.1.4 timechange_0.2.0
## [171] globals_0.16.2